; some sorting routines
; implementation in fasm = macgub
;==============================================================================
sort_hybrid_dd:
; Sort a table of dword units into ascending order (signed compare).
; Two phases: qsort_hybrid_d pre-sorts the table (it leaves short ranges
; untouched), then one insertion sort pass over the whole table finishes it.
; in:  esi - 1st table (sorted in place)
;      edi - sec table - handed to mix_tab_d as the output/scratch table when
;            there are more than 100 units; its end is used as the stack for
;            the quicksort phase when there are more than 5000 units
;      ecx - tables_units count - unit 4 bytes
; out:
;      sorted first table
; not preserved: eax, ebx, ecx, esi, xmm0, xmm5, xmm7 and everything the
;      called routines (mix_tab_d, qsort_hybrid_d) change
   push        ebp
   mov         ebp,esp
   .first_tab  equ  [ebp-4]
   .sec_tab    equ  dword[ebp-8]
   .units_cnt  equ  dword[ebp-12]

   push    esi                ; -> .first_tab
   push    edi                ; -> .sec_tab
   push    ecx                ; -> .units_cnt
   sub     esp,8
   cmp     ecx,2
   jb      .end               ; 0 or 1 unit: nothing to sort (the insertion
                              ; loop below must not start with ecx <= 0)
   cmp     ecx,100
   jna     @f
   call    mix_tab_d          ; random list if needed
  @@:
   cmp     .units_cnt,5000
   jna     @f
   movd    xmm5,esp           ; change/increase stack: remember the real esp
   mov     esp,.sec_tab       ; and recurse on the end of the second table
   mov     ebx,.units_cnt
   shl     ebx,2
   add     esp,ebx
  @@:
   mov     eax,1              ; first index given to the quicksort is 1;
                              ; unit 0 is placed by the insertion pass
   mov     ebx,.units_cnt
   dec     ebx                ; last index
   movd    xmm7,.first_tab    ; table pointer travels in xmm7
   call    qsort_hybrid_d
   cmp     .units_cnt,5000
   jna     @f
   movd    esp,xmm5           ; back on the original stack
  @@:
; sort  again
; next  phase
   mov     ebx,.first_tab     ; insert sort
   mov     ecx,.units_cnt
   mov     esi,ecx
   dec     ecx                ; units_cnt-1 neighbouring pairs to visit
   shl     esi,2
   add     esi,ebx            ; esi = first byte behind the table
 .ccc:                        ; ebx   - unit already in place
   mov     eax,[ebx+4]        ; ebx+4 - unit to insert, eax = its key
   cmp     eax,[ebx]
   jge     .g                 ; not smaller than its predecessor: keep it
   movss   xmm0,[ebx+4]       ; hold the unit while bigger ones move up
   push    ebx
 .c111:
   cmp     ebx,esi
   jae     .done
   cmp     ebx,.first_tab
   jb      .done              ; went below the table start
   cmp     eax,[ebx]
   jge     .done              ; signed, same ordering as the compare at .ccc
   movss   xmm7,[ebx]         ; shift the bigger unit one slot up
   movss   [ebx+4],xmm7
   sub     ebx,4
   jnz     .c111
   add     ebx,4
 .done:
   movss   [ebx+4],xmm0       ; drop the held unit into the free slot
 .p:
   pop     ebx
 .g:
   add     ebx,4
   loop    .ccc
 .end:
    mov    esp,ebp
    pop    ebp
ret
;========================================================
sort_hybrid:
; Sort a table of 8 byte units into ascending order of their first dword
; (signed compare).  Two phases: qsort_hybrid pre-sorts the table (it leaves
; short ranges untouched), then one insertion sort pass finishes it.
; in:  esi - 1st table (sorted in place)
;      edi - sec table - handed to mix_tab as the output/scratch table when
;            there are more than 100 units; its end is used as the stack for
;            the quicksort phase when there are more than 5000 units
;      ecx - tables_units count - unit 8 bytes
;            sort key -> first unit dword
; out:
;      sorted first table
; not preserved: eax, ebx, ecx, esi, xmm0, xmm5, xmm7 and everything the
;      called routines (mix_tab, qsort_hybrid) change
   push        ebp
   mov         ebp,esp
   .first_tab  equ  [ebp-4]
   .sec_tab    equ  [ebp-8]
   .units_cnt  equ  dword[ebp-12]
   push     esi                ; -> .first_tab
   push     edi                ; -> .sec_tab
   push     ecx                ; -> .units_cnt
   cmp      ecx,2
   jb       .end               ; 0 or 1 unit: nothing to sort (the insertion
                               ; loop below never terminates for them)
   cmp      ecx,100
   jna      @f
   call     mix_tab
 @@:
   cmp      .units_cnt,5000     ; stack to increase ?
   jna      @f
   movd     xmm5,esp            ; remember the real esp
   mov      esp,.sec_tab        ; and recurse on the end of the second table
   mov      ebx,.units_cnt
   shl      ebx,3
   add      esp,ebx
@@:
   mov      eax,1               ; first index given to the quicksort is 1;
                                ; unit 0 is placed by the insertion pass
   mov      ebx,.units_cnt
   dec      ebx                 ; last index
   movd     xmm7,.first_tab     ; table pointer travels in xmm7
   call     qsort_hybrid    ; qword - one unit
   cmp      .units_cnt,5000
   jna      @f
   movd     esp,xmm5            ; back on the original stack
  @@:
; sort again
; next phase
   mov      ebx,.first_tab
   mov      ecx,.units_cnt
   mov      esi,ecx
   shl      esi,3
   add      esi,ebx             ; esi = first byte behind the table
 .ccc:                          ; ebx   - unit already in place
   mov      eax,[ebx+8]         ; ebx+8 - unit to insert, eax = its key
   cmp      eax,[ebx]
   jge      .g                  ; not smaller than its predecessor: keep it
   movlps   xmm0,[ebx+8]        ; hold the whole unit while bigger ones move up
   push     ebx
 .c22:
   cmp      ebx,esi
   jae      .done
   cmp      ebx,.first_tab
   jb       .done               ; went below the table start
   cmp      eax,[ebx]
   jge      .done               ; signed, same ordering as the compare at .ccc
   movlps   xmm7,[ebx]          ; shift the bigger unit one slot up
   movlps   [ebx+8],xmm7
   sub      ebx,8
   jnc      .c22
   add      ebx,8
 .done:
   movlps   [ebx+8],xmm0        ; drop the held unit into the free slot
 .p:
   pop      ebx
 .g:
   add      ebx,8
   dec      ecx
   cmp      ecx,1               ; units_cnt-1 neighbouring pairs in total
   jnz      .ccc
  .end:
   mov      esp,ebp
   pop      ebp
ret
;==================================================================================
;========================================
qsort_hybrid:
; Recursive quicksort pass over 8 byte units, key = first dword (signed).
; sort only to cutoff moment - such pre sorted tab
; - sort again using insert sort
; Ranges with last-first < 5 are returned untouched.
; in:
;       eax - first (unit index)
;       ebx - last  (unit index, inclusive)
;       xmm7 - table
; not preserved: eax, ebx, ecx, edx, esi, edi, xmm0, xmm1, flags
     push    ebp
     mov     ebp,esp
    .i       equ dword[ebp-4]
    .last    equ dword[ebp-8]
    .first   equ dword[ebp-12]
    .j       equ dword[ebp-16]
     push    eax         ; .i     = first
     push    ebx         ; .last
     push    eax         ; .first
     push    ebx         ; .j     = last
     mov     edx,ebx
     sub     edx,eax
     cmp     edx,5
     jl      .nxp        ; time to next phase sorting ?
     movd    esi,xmm7  ;.tabl
     shl     eax,3
     shl     ebx,3
     ; pivot = floor((key[first]+key[last])/2), built from the halves so the
     ; sum cannot overflow and negative keys keep their sign
     mov     edx,[esi+eax]
     mov     ecx,[esi+ebx]
     mov     edi,edx
     and     edi,ecx
     and     edi,1       ; 1 only when both dropped low bits were set
     sar     edx,1
     sar     ecx,1
     add     edx,ecx
     add     edx,edi     ; edx = pivot
     mov     ecx,.i      ; ecx = i
     pop     ebx         ; ebx = j (takes the .j slot off the stack)
   .nnnx:
     cmp     ecx,ebx  ;.j
     jnle    .srt        ; i > j : partition finished
     movd    esi,xmm7  ;.tabl
     mov     eax,ecx   ;.i
     shl     eax,3
     add     esi,eax
     dec     ecx         ; step back once, the scan loop pre-increments
     sub     esi,8
   @@:
     add     esi,8
     inc     ecx  ;.i
     cmp     [esi],edx  ;.pivot
     jl      @b          ; skip units whose key is below the pivot
     movd    edi,xmm7
     mov     eax,ebx  ;.j
     shl     eax,3
     add     edi,eax
     add     edi,8       ; step forward once, the scan loop pre-decrements
     inc     ebx
   @@:
     sub     edi,8
     dec     ebx   ;.j
     cmp     [edi],edx  ;.pivot
     jg      @b          ; skip units whose key is above the pivot
     cmp     ecx,ebx  ;.j
     jnle    @f
     movlps  xmm0,[esi]  ; i <= j : swap both whole units
     movlps  xmm1,[edi]
     movlps  [esi],xmm1
     movlps  [edi],xmm0
     inc     ecx  ;.i
     dec     ebx  ;.j
    @@:
     jmp     .nnnx
 .srt:
     mov     .i,ecx
     push    ebx         ; refill the .j slot
     mov     eax,.first
     cmp     eax,ebx  ;.j
     jge     @f
     call    qsort_hybrid    ; left part:  first .. j
   @@:
     mov     eax,.i
     cmp     eax,.last
     jge     @f
     mov     ebx,.last
     call     qsort_hybrid   ; right part: i .. last
 @@:
 .nxp:
     mov     esp,ebp
     pop     ebp
ret
;==================================================================================
qsort_hybrid_d:
; Recursive quicksort pass over dword units (signed compare).
; sort dwords
; sort only to cutoff moment - such pre sorted tab -
; - sort again using insert sort
; Ranges with last-first < 5 are returned untouched.
; in:
;       eax -  first (unit index)
;       ebx -  last  (unit index, inclusive)
;       xmm7 - table
; not preserved: eax, ebx, ecx, edx, esi, edi, xmm0, xmm1, flags
    push   ebp
    mov    ebp,esp
   .i      equ dword[ebp-4]
   .last   equ dword[ebp-8]
   .first  equ dword[ebp-12]
   .j      equ dword[ebp-16]
    push   eax            ; .i     = first
    push   ebx            ; .last
    push   eax            ; .first
    push   ebx            ; .j     = last
    mov    edx,ebx
    sub    edx,eax
    cmp    edx,5
    jl     .nxp           ; time to next phase sorting
                          ; not now
    movd   esi,xmm7  ;.tabl
    shl    eax,2
    shl    ebx,2
    ; pivot = floor((key[first]+key[last])/2), built from the halves so the
    ; sum cannot overflow and negative keys keep their sign
    mov    edx,[esi+eax]
    mov    ecx,[esi+ebx]
    mov    edi,edx
    and    edi,ecx
    and    edi,1          ; 1 only when both dropped low bits were set
    sar    edx,1
    sar    ecx,1
    add    edx,ecx
    add    edx,edi        ; edx = pivot
    mov    ecx,.i         ; ecx = i
    pop    ebx            ; ebx = j (takes the .j slot off the stack)
  .nnnx:
    cmp    ecx,ebx  ;.j
    jnle   .srt           ; i > j : partition finished
    movd   esi,xmm7  ;.tabl
    mov    eax,ecx   ;.i
    shl    eax,2
    add    esi,eax
    dec    ecx            ; step back once, the scan loop pre-increments
    sub    esi,4
   @@:
    add    esi,4
    inc    ecx  ;.i
    cmp    [esi],edx  ;.pivot
    jl     @b             ; skip units below the pivot
    movd   edi,xmm7
    mov    eax,ebx  ;.j
    shl    eax,2
    add    edi,eax
    add    edi,4          ; step forward once, the scan loop pre-decrements
    inc    ebx
   @@:
    sub    edi,4
    dec    ebx
    cmp    [edi],edx  ;.pivot
    jg     @b             ; skip units above the pivot
    cmp    ecx,ebx
    jnle   @f
    movss  xmm0,[esi]     ; i <= j : swap the two units; 4 byte loads so
    movss  xmm1,[edi]     ; nothing behind the last unit is read
    movss  [esi],xmm1
    movss  [edi],xmm0
    inc    ecx
    dec    ebx
   @@:
    jmp    .nnnx
 .srt:
    mov    .i,ecx
    push   ebx            ; refill the .j slot
    mov    eax,.first
    cmp    eax,ebx  ;.j
    jge    @f
    call   qsort_hybrid_d ; left part:  first .. j
  @@:
    mov    eax,.i
    cmp    eax,.last
    jge    @f
    mov    ebx,.last
    call   qsort_hybrid_d ; right part: i .. last
  @@:
  .nxp:
    mov    esp,ebp
    pop    ebp
ret
;==========================================================================
mix_tab:
; Shuffle a table of 8 byte units.  One pass splits the source into four
; quarters of ecx/4 units and writes them interleaved (q0,q1,q2,q3,q0,...)
; to the destination; the ecx mod 4 units left at the end are copied as they
; are.  Six passes are made and the two tables change roles after each one,
; so the last pass writes into the table passed in esi.
; in:
;      esi - in tab
;      edi - out tab
;      ecx - unit number, unit 8 bytes
; out:
;      both tables overwritten, final shuffle in the esi table
; not preserved: eax, ebx, ecx, edx, esi, edi, xmm0, xmm1, flags
   push     ebp
   mov      ebp,esp
   .dst     equ dword[ebp-4]
   .src     equ dword[ebp-8]
   .cnt     equ dword[ebp-12]
   .cnt1    equ dword[ebp-16]
   .rest    equ dword[ebp-20]
   push     edi              ; -> .dst
   push     esi              ; -> .src
   push     ecx              ; -> .cnt
   mov      eax,ecx
   shr      eax,2
   push     eax              ; -> .cnt1 = units per quarter
   and      ecx,11b
   push     ecx              ; -> .rest = units behind the four quarters
   mov      ecx,6            ; number of passes
 .lll:
   push     ecx              ; pass counter lives on the stack meanwhile
   mov      edi,.dst
   mov      esi,.src
   mov      ecx,.cnt1
   shl      ecx,3            ; quarter size in bytes
   lea      eax,[esi+ecx]    ; eax -> 2nd quarter
   lea      ebx,[esi+ecx*2]  ; ebx -> 3rd quarter
   lea      edx,[eax+ecx*2]  ; edx -> 4th quarter
   mov      ecx,.cnt1
   jecxz    .tail            ; fewer than 4 units: 'loop' must not start at 0
 @@:
   movlps   xmm0,[esi]
   movhps   xmm0,[eax]
   movlps   xmm1,[ebx]
   movhps   xmm1,[edx]
   movups   [edi],xmm0       ; unaligned stores: the tables need no
   movups   [edi+16],xmm1    ; 16 byte alignment
   add      esi,8
   add      eax,8
   add      ebx,8
   add      edx,8
   add      edi,32
   loop     @b
  .tail:
   mov      ecx,.rest        ; edx now points at the leftover units
   jecxz    .ll2
  @@:
   movlps   xmm0,[edx]
   movlps   [edi],xmm0
   add      edx,8
   add      edi,8
   loop     @b
  .ll2:
   pop      ecx              ; pass counter
   push     .dst  ; swap
   push     .src
   pop      .dst
   pop      .src
   loop     .lll
   mov      esp,ebp
   pop      ebp
ret
;=============================================================================
mix_tab_d:
; Shuffle a table of dword units.  One pass splits the source into four
; quarters of ecx/4 units and writes one unit of each per group, in the
; order q0,q2,q1,q3; the ecx mod 4 units left at the end are copied as they
; are.  Six passes are made and the two tables change roles after each one,
; so the last pass writes into the table passed in esi.
; in:
;      esi - in tab
;      edi - out tab
;      ecx - unit number, unit 4  bytes
; out:
;      both tables overwritten, final shuffle in the esi table
; not preserved: eax, ebx, ecx, edx, esi, edi, xmm0 - xmm3, flags
   push      ebp
   mov       ebp,esp
   .dst      equ dword[ebp-4]
   .src      equ dword[ebp-8]
   .cnt      equ dword[ebp-12]
   .cnt1     equ dword[ebp-16]
   .rest     equ dword[ebp-20]
   push      edi             ; -> .dst
   push      esi             ; -> .src
   push      ecx             ; -> .cnt
   mov       eax,ecx
   shr       eax,2   ; cnt1
   push      eax             ; -> .cnt1 = units per quarter
   and       ecx,11b
   push      ecx             ; -> .rest = units behind the four quarters
   mov       ecx,6           ; number of passes
 .lll:
   push      ecx             ; pass counter lives on the stack meanwhile
   mov       edi,.dst
   mov       esi,.src
   mov       ecx,.cnt1
   shl       ecx,2           ; quarter size in bytes
   lea       eax,[esi+ecx]   ; eax -> 2nd quarter
   lea       ebx,[esi+ecx*2] ; ebx -> 3rd quarter
   lea       edx,[eax+ecx*2] ; edx -> 4th quarter
   mov       ecx,.cnt1
   jecxz     .tail           ; fewer than 4 units: 'loop' must not start at 0
 @@:
   movss     xmm0,[esi]      ; 4 byte loads: an 8 byte load at the last unit
   movss     xmm1,[eax]      ; of the 4th quarter would read behind the table
   movss     xmm2,[ebx]
   movss     xmm3,[edx]
   punpckldq xmm0,xmm1       ; xmm0 = q0,q1
   punpckldq xmm2,xmm3       ; xmm2 = q2,q3
   punpckldq xmm0,xmm2       ; xmm0 = q0,q2,q1,q3
   movups    [edi],xmm0
   add       esi,4
   add       eax,4
   add       ebx,4
   add       edx,4
   add       edi,16
   loop      @b
  .tail:
   mov       ecx,.rest       ; edx now points at the leftover units
   jecxz     .ll2
  @@:
   movss     xmm0,[edx]
   movss     [edi],xmm0
   add       edx,4
   add       edi,4
   loop      @b
 .ll2:
   pop       ecx             ; pass counter
   push     .dst  ; swap
   push     .src
   pop      .dst
   pop      .src
   loop     .lll
   mov      esp,ebp
   pop      ebp
ret
if 0
;=========================================================
sort_hybrid_q:
; Sort a table of 8 byte units into ascending order, the whole qword being
; the key (signed compare with pcmpgtq, which needs SSE4.2).
; Two phases: qsort_hybridq pre-sorts the table, then one insertion sort
; pass finishes it.
; in:  esi - 1st table (sorted in place)
;      edi - sec table - handed to mix_tab as the output/scratch table when
;            there are more than 100 units; its end is used as the stack for
;            the quicksort phase when there are more than 5000 units
;      ecx - tables_units count - unit 8 bytes
; out:
;      sorted first table
; not preserved: ebx, ecx, edx, esi, eax, xmm0, xmm4 - xmm7 and everything
;      the called routines (mix_tab, qsort_hybridq) change
   push        ebp
   mov         ebp,esp
   .first_tab  equ  [ebp-4]
   .sec_tab    equ  dword[ebp-8]
   .units_cnt  equ  dword[ebp-12]

   push    esi                ; -> .first_tab
   push    edi                ; -> .sec_tab
   push    ecx                ; -> .units_cnt
   sub     esp,8
   cmp     ecx,2
   jb      .end               ; 0 or 1 unit: nothing to sort (the insertion
                              ; loop below never terminates for them)
   cmp     ecx,100
   jna     @f
   call    mix_tab          ; random list if needed
  @@:
   cmp     .units_cnt,5000
   jna     @f
   movd    xmm5,esp           ; change/increase stack: remember the real esp
   mov     esp,.sec_tab
   mov     ebx,.units_cnt
   shl     ebx,3              ; units are 8 bytes: stack top = table end
   add     esp,ebx
  @@:
   mov     eax,1              ; first index given to the quicksort is 1;
                              ; unit 0 is placed by the insertion pass
   mov     ebx,.units_cnt
   dec     ebx                ; last index
   movd    xmm7,.first_tab    ; table pointer travels in xmm7
   call    qsort_hybridq
   cmp     .units_cnt,5000
   jna     @f
   movd    esp,xmm5           ; back on the original stack
  @@:
; sort  again
; next phase
   mov      ebx,.first_tab
   mov      ecx,.units_cnt
   mov      esi,ecx
   shl      esi,3
   add      esi,ebx             ; esi = first byte behind the table
 .ccc:                          ; ebx   - unit already in place
   movlps   xmm6,[ebx+8]        ; ebx+8 - unit to insert (xmm6 = its key)
   movlps   xmm5,[ebx]
   pcmpgtq  xmm5,xmm6           ; predecessor > key ?
   movmskps edx,xmm5
   and       edx,1              ; bit 0 belongs to the low qword
   jz        .g                 ; no: unit stays where it is
   movlps   xmm0,[ebx+8]        ; hold the unit while bigger ones move up
   push     ebx
 .c22:
   cmp      ebx,esi
   jae      .done
   cmp      ebx,.first_tab
   jb       .done               ; went below the table start
   movlps   xmm4,[ebx]
   pcmpgtq  xmm4,xmm6           ; this unit > key ?
   movmskps edx,xmm4
   and       edx,1
   jz       .done               ; no: insertion point found
   movlps   xmm7,[ebx]          ; shift the bigger unit one slot up
   movlps   [ebx+8],xmm7
   sub      ebx,8
   jnc      .c22
   add      ebx,8
 .done:
   movlps   [ebx+8],xmm0        ; drop the held unit into the free slot
 .p:
   pop      ebx
 .g:
   add      ebx,8
   dec      ecx
   cmp      ecx,1               ; units_cnt-1 neighbouring pairs in total
   jnz      .ccc
 .end:
   mov      esp,ebp
   pop      ebp
ret

;======================================================================
qsort_hybridq:
; Recursive quicksort pass over 8 byte units, the whole qword being the key
; (signed compare with pcmpgtq, which needs SSE4.2).
; sort only to cutoff moment - such pre sorted tab
; - sort again using insert sort
; Ranges with last-first < 5 are returned untouched.
; in:
;       eax - first (unit index)
;       ebx - last  (unit index, inclusive)
;       xmm7 - table
; not preserved: eax, ebx, ecx, edx, esi, edi, xmm0 - xmm4, flags
     push    ebp
     mov     ebp,esp
    .i       equ dword[ebp-4]
    .last    equ dword[ebp-8]
    .first   equ dword[ebp-12]
    .j       equ dword[ebp-16]
     push    eax         ; .i     = first
     push    ebx         ; .last
     push    eax         ; .first
     push    ebx         ; .j     = last
     mov     edx,ebx
     sub     edx,eax
     cmp     edx,5
     jl      .nxp        ; time to next phase sorting ?
     movd    esi,xmm7  ;.tabl
     shl     eax,3
     shl     ebx,3
     movlps  xmm2,[esi+eax]    ; key[first]
     movlps  xmm3,[esi+ebx]    ; key[last]
     paddq   xmm3,xmm2
     psrlq   xmm3,1            ; xmm3 (low qword) = pivot
                               ; NOTE(review): logical shift and a sum that
                               ; may wrap - only right for non-negative keys
     mov     ecx,.i      ; ecx = i
     pop     ebx         ; ebx = j (takes the .j slot off the stack)
   .nnnx:
     cmp     ecx,ebx  ;.j
     jnle    .srt        ; i > j : partition finished
     movd    esi,xmm7  ;.tabl
     mov     eax,ecx   ;.i
     shl     eax,3
     add     esi,eax
     dec     ecx         ; step back once, the scan loop pre-increments
     sub     esi,8
   @@:
     add     esi,8
     inc     ecx  ;.i
     movlps  xmm4,[esi]
     movaps  xmm2,xmm3
     pcmpgtq xmm2,xmm4         ; pivot > key[i] ?
     movmskps edx,xmm2
     and     edx,1             ; bit 0 belongs to the low qword
     jnz     @b                ; skip units strictly below the pivot; a unit
                               ; equal to the pivot stops the scan
     movd    edi,xmm7
     mov     eax,ebx  ;.j
     shl     eax,3
     add     edi,eax
     add     edi,8       ; step forward once, the scan loop pre-decrements
     inc     ebx
   @@:
     sub     edi,8
     dec     ebx   ;.j
     movlps  xmm4,[edi]
     pcmpgtq xmm4,xmm3         ; key[j] > pivot ?
     movmskps edx,xmm4
     and     edx,1
     jnz     @b                ; skip units strictly above the pivot

     cmp     ecx,ebx  ;.j
     jnle    @f
     movlps  xmm0,[esi]  ; i <= j : swap both units
     movlps  xmm1,[edi]
     movlps  [esi],xmm1
     movlps  [edi],xmm0
     inc     ecx  ;.i
     dec     ebx  ;.j
   @@:
     jmp     .nnnx
   .srt:
     mov     .i,ecx
     push    ebx         ; refill the .j slot
     mov     eax,.first
     cmp     eax,ebx  ;.j
     jge     @f
     call    qsort_hybridq     ; left part:  first .. j
   @@:
     mov     eax,.i
     cmp     eax,.last
     jge     @f
     mov     ebx,.last
     call    qsort_hybridq     ; right part: i .. last
   @@:
   .nxp:
     mov     esp,ebp
     pop     ebp
ret
;==========================
end if